{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Configuring pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import numpy and pandas\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# used for dates\n",
    "import datetime\n",
    "from datetime import datetime, date\n",
    "\n",
    "# Set some pandas options controlling output format\n",
    "pd.set_option('display.notebook_repr_html', False)\n",
    "pd.set_option('display.max_columns', 8)\n",
    "pd.set_option('display.max_rows', 10)\n",
    "pd.set_option('display.width', 60)\n",
    "\n",
    "# bring in matplotlib for graphics\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Working with missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   c1  c2  c3\n",
       "a   0   1   2\n",
       "b   3   4   5\n",
       "c   6   7   8\n",
       "d   9  10  11\n",
       "e  12  13  14"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a DataFrame with 5 rows and 3 columns\n",
    "df = pd.DataFrame(np.arange(0, 15).reshape(5, 3), \n",
    "               index=['a', 'b', 'c', 'd', 'e'], \n",
    "               columns=['c1', 'c2', 'c3'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4  c5\n",
       "a   0.0   1.0   2.0  20.0 NaN\n",
       "b   3.0   4.0   5.0   NaN NaN\n",
       "c   6.0   7.0   8.0   NaN NaN\n",
       "d   9.0  10.0  11.0   NaN NaN\n",
       "e  12.0  13.0  14.0   NaN NaN\n",
       "f  15.0  16.0  17.0  18.0 NaN\n",
       "g   NaN   NaN   NaN   NaN NaN"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add some columns and rows to the DataFrame\n",
    "# column c4 with NaN values\n",
    "df['c4'] = np.nan\n",
    "# row 'f' with 15 through 18 \n",
    "df.loc['f'] = np.arange(15, 19) \n",
    "# row 'g' will all NaN\n",
    "df.loc['g'] = np.nan\n",
    "# column 'C5' with NaN's\n",
    "df['c5'] = np.nan\n",
    "# change value in col 'c4' row 'a'\n",
    "df['c4']['a'] = 20\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Determining NaN values in Series and DataFrame objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      c1     c2     c3     c4    c5\n",
       "a  False  False  False  False  True\n",
       "b  False  False  False   True  True\n",
       "c  False  False  False   True  True\n",
       "d  False  False  False   True  True\n",
       "e  False  False  False   True  True\n",
       "f  False  False  False  False  True\n",
       "g   True   True   True   True  True"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# which items are NaN?\n",
    "df.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "c1    1\n",
       "c2    1\n",
       "c3    1\n",
       "c4    5\n",
       "c5    7\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the number of NaN's in each column\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# total count of NaN values\n",
    "df.isnull().sum().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "c1    6\n",
       "c2    6\n",
       "c3    6\n",
       "c4    2\n",
       "c5    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of non-NaN values in each column\n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and this counts the number of NaN's too\n",
    "(len(df) - df.count()).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      c1     c2     c3     c4     c5\n",
       "a   True   True   True   True  False\n",
       "b   True   True   True  False  False\n",
       "c   True   True   True  False  False\n",
       "d   True   True   True  False  False\n",
       "e   True   True   True  False  False\n",
       "f   True   True   True   True  False\n",
       "g  False  False  False  False  False"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# which items are not null?\n",
    "df.notnull()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Selecting out or dropping missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "f    18.0\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# select the non-NaN items in column c4\n",
    "df.c4[df.c4.notnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "f    18.0\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# .dropna will also return non NaN values\n",
    "# this gets all non NaN items in column c4\n",
    "df.c4.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "b     NaN\n",
       "c     NaN\n",
       "d     NaN\n",
       "e     NaN\n",
       "f    18.0\n",
       "g     NaN\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# dropna returns a copy with the values dropped\n",
    "# the source DataFrame / column is not changed\n",
    "df.c4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [c1, c2, c3, c4, c5]\n",
       "Index: []"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# on a DataFrame this will drop entire rows\n",
    "# where there is at least one NaN\n",
    "# in this case, that is all rows\n",
    "df.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4  c5\n",
       "a   0.0   1.0   2.0  20.0 NaN\n",
       "b   3.0   4.0   5.0   NaN NaN\n",
       "c   6.0   7.0   8.0   NaN NaN\n",
       "d   9.0  10.0  11.0   NaN NaN\n",
       "e  12.0  13.0  14.0   NaN NaN\n",
       "f  15.0  16.0  17.0  18.0 NaN"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# using how='all', only rows that have all values\n",
    "# as NaN will be dropped\n",
    "df.dropna(how = 'all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4\n",
       "a   0.0   1.0   2.0  20.0\n",
       "b   3.0   4.0   5.0   NaN\n",
       "c   6.0   7.0   8.0   NaN\n",
       "d   9.0  10.0  11.0   NaN\n",
       "e  12.0  13.0  14.0   NaN\n",
       "f  15.0  16.0  17.0  18.0\n",
       "g   NaN   NaN   NaN   NaN"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# flip to drop columns instead of rows\n",
    "df.dropna(how='all', axis=1) # say goodbye to c5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4  c5\n",
       "a   0.0   1.0   2.0  20.0 NaN\n",
       "b   3.0   4.0   5.0   NaN NaN\n",
       "c   6.0   7.0   8.0   NaN NaN\n",
       "d   9.0  10.0  11.0   NaN NaN\n",
       "e  12.0  13.0  14.0   NaN NaN\n",
       "f  15.0  16.0  17.0  18.0 NaN\n",
       "g   0.0   NaN   0.0   NaN NaN"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make a copy of df\n",
    "df2 = df.copy()\n",
    "# replace two NaN cells with values\n",
    "df2.loc['g'].c1 = 0\n",
    "df2.loc['g'].c3 = 0\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c3\n",
       "a   0.0   2.0\n",
       "b   3.0   5.0\n",
       "c   6.0   8.0\n",
       "d   9.0  11.0\n",
       "e  12.0  14.0\n",
       "f  15.0  17.0\n",
       "g   0.0   0.0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# now drop columns with any NaN values\n",
    "df2.dropna(how='any', axis=1) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3\n",
       "a   0.0   1.0   2.0\n",
       "b   3.0   4.0   5.0\n",
       "c   6.0   7.0   8.0\n",
       "d   9.0  10.0  11.0\n",
       "e  12.0  13.0  14.0\n",
       "f  15.0  16.0  17.0\n",
       "g   NaN   NaN   NaN"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# only drop columns with at least 5 NaN values\n",
    "df.dropna(thresh=5, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How pandas handles NaN’s in mathematical operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(nan, 2.0)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a NumPy array with one NaN value\n",
    "a = np.array([1, 2, np.nan, 3])\n",
    "# create a Series from the array\n",
    "s = pd.Series(a)\n",
    "# the mean of each is different\n",
    "a.mean(), s.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(38.0,)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate sum, mean and cumsum handling of NaN\n",
    "# get one column\n",
    "s = df.c4\n",
    "s.sum(), # NaN's treated as 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.mean() # NaN also treated as 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "b     NaN\n",
       "c     NaN\n",
       "d     NaN\n",
       "e     NaN\n",
       "f    38.0\n",
       "g     NaN\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# as 0 in the cumsum, but NaN's preserved in result Series\n",
    "s.cumsum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    21.0\n",
       "b     NaN\n",
       "c     NaN\n",
       "d     NaN\n",
       "e     NaN\n",
       "f    19.0\n",
       "g     NaN\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# in arithmetic, a NaN value will result in NaN\n",
    "df.c4 + 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filling in missing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4   c5\n",
       "a   0.0   1.0   2.0  20.0  0.0\n",
       "b   3.0   4.0   5.0   0.0  0.0\n",
       "c   6.0   7.0   8.0   0.0  0.0\n",
       "d   9.0  10.0  11.0   0.0  0.0\n",
       "e  12.0  13.0  14.0   0.0  0.0\n",
       "f  15.0  16.0  17.0  18.0  0.0\n",
       "g   0.0   0.0   0.0   0.0  0.0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# return a new DataFrame with NaN's filled with 0\n",
    "filled = df.fillna(0)\n",
    "filled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "c1     7.5\n",
       "c2     8.5\n",
       "c3     9.5\n",
       "c4    19.0\n",
       "c5     NaN\n",
       "dtype: float64"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NaN's don't count as an item in calculating\n",
    "# the means\n",
    "df.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "c1    6.428571\n",
       "c2    7.285714\n",
       "c3    8.142857\n",
       "c4    5.428571\n",
       "c5    0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# having replaced NaN with 0 can make\n",
    "# operations such as mean have different results\n",
    "filled.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Forward and backwards filling of missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "b    20.0\n",
       "c    20.0\n",
       "d    20.0\n",
       "e    20.0\n",
       "f    18.0\n",
       "g    18.0\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extract the c4 column and fill NaNs forward\n",
    "df.c4.fillna(method=\"ffill\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    20.0\n",
       "b    18.0\n",
       "c    18.0\n",
       "d    18.0\n",
       "e    18.0\n",
       "f    18.0\n",
       "g     NaN\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# perform a backwards fill\n",
    "df.c4.fillna(method=\"bfill\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filling using index labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    100\n",
       "e    101\n",
       "g    102\n",
       "dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new Series of values to be \n",
    "# used to fill NaN's where index label matches\n",
    "fill_values = pd.Series([100, 101, 102], index=['a', 'e', 'g'])\n",
    "fill_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a     20.0\n",
       "b      NaN\n",
       "c      NaN\n",
       "d      NaN\n",
       "e    101.0\n",
       "f     18.0\n",
       "g    102.0\n",
       "Name: c4, dtype: float64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# using c4, fill using fill_values\n",
    "# a, e and g will be filled with matching values\n",
    "df.c4.fillna(fill_values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     c1    c2    c3    c4  c5\n",
       "a   0.0   1.0   2.0  20.0 NaN\n",
       "b   3.0   4.0   5.0  19.0 NaN\n",
       "c   6.0   7.0   8.0  19.0 NaN\n",
       "d   9.0  10.0  11.0  19.0 NaN\n",
       "e  12.0  13.0  14.0  19.0 NaN\n",
       "f  15.0  16.0  17.0  18.0 NaN\n",
       "g   7.5   8.5   9.5  19.0 NaN"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fill NaN values in each column with the \n",
    "# mean of the values in that column\n",
    "df.fillna(df.mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Interpolation of missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1.00\n",
       "1    1.25\n",
       "2    1.50\n",
       "3    1.75\n",
       "4    2.00\n",
       "dtype: float64"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# linear interpolate the NaN values from 1 through 2\n",
    "s = pd.Series([1, np.nan, np.nan, np.nan, 2])\n",
    "s.interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2014-01-01    1.0\n",
       "2014-02-01    NaN\n",
       "2014-04-01    2.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a time series, but missing one date in the Series\n",
    "ts = pd.Series([1, np.nan, 2], \n",
    "            index=[datetime(2014, 1, 1), \n",
    "                   datetime(2014, 2, 1),                   \n",
    "                   datetime(2014, 4, 1)])\n",
    "ts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2014-01-01    1.0\n",
       "2014-02-01    1.5\n",
       "2014-04-01    2.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# linear interpolate based on number of items in the Series\n",
    "ts.interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2014-01-01    1.000000\n",
       "2014-02-01    1.344444\n",
       "2014-04-01    2.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this accounts for the fact that we don't have\n",
    "# an entry for 2014-03-01\n",
    "ts.interpolate(method=\"time\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       0.0\n",
       "1       NaN\n",
       "10    100.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a Series to demonstrate index label based interpolation\n",
    "s = pd.Series([0, np.nan, 100], index=[0, 1, 10])\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       0.0\n",
       "1      50.0\n",
       "10    100.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# linear interpolate\n",
    "s.interpolate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0       0.0\n",
       "1      10.0\n",
       "10    100.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# interpolate based upon the values in the index\n",
    "s.interpolate(method=\"values\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling Duplicate Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b\n",
       "0  x  1\n",
       "1  x  1\n",
       "2  x  2\n",
       "3  y  3\n",
       "4  y  3\n",
       "5  y  4\n",
       "6  y  4"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# a DataFrame with lots of duplicate data\n",
    "data = pd.DataFrame({'a': ['x'] * 3 + ['y'] * 4, \n",
    "                     'b': [1, 1, 2, 3, 3, 4, 4]})\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1     True\n",
       "2    False\n",
       "3    False\n",
       "4     True\n",
       "5    False\n",
       "6     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reports which rows are duplicates based upon\n",
    "# if the data in all columns was seen before\n",
    "data.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b\n",
       "0  x  1\n",
       "2  x  2\n",
       "3  y  3\n",
       "5  y  4"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop duplicate rows retaining first row of the duplicates\n",
    "data.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b\n",
       "1  x  1\n",
       "2  x  2\n",
       "4  y  3\n",
       "6  y  4"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# drop duplicate rows, only keeping the first \n",
    "# instance of any data\n",
    "data.drop_duplicates(keep='last')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "5    False\n",
       "6    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# add a column c with values 0..6\n",
    "# this makes .duplicated() report no duplicate rows\n",
    "data['c'] = range(7)\n",
    "data.duplicated()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b  c\n",
       "0  x  1  0\n",
       "2  x  2  2\n",
       "3  y  3  3\n",
       "5  y  4  5"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# but if we specify duplicates to be dropped only in columns a & b\n",
    "# they will be dropped\n",
    "data.drop_duplicates(['a', 'b'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "one      1\n",
       "three    3\n",
       "two      2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create two Series objects to demonstrate mapping\n",
    "x = pd.Series({\"one\": 1, \"two\": 2, \"three\": 3})\n",
    "y = pd.Series({1: \"a\", 2: \"b\", 3: \"c\"})\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    a\n",
       "2    b\n",
       "3    c\n",
       "dtype: object"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "one      a\n",
       "three    c\n",
       "two      b\n",
       "dtype: object"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# map values in x to values in y \n",
    "x.map(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "one        a\n",
       "three    NaN\n",
       "two        b\n",
       "dtype: object"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# three in x will not align / map to a value in y\n",
    "x = pd.Series({\"one\": 1, \"two\": 2, \"three\": 3})\n",
    "y = pd.Series({1: \"a\", 2: \"b\"})\n",
    "x.map(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Replacing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.0\n",
       "1    1.0\n",
       "2    2.0\n",
       "3    3.0\n",
       "4    2.0\n",
       "5    4.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a Series to demonstrate replace\n",
    "s = pd.Series([0., 1., 2., 3., 2., 4.])\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.0\n",
       "1    1.0\n",
       "2    5.0\n",
       "3    3.0\n",
       "4    5.0\n",
       "5    4.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace all items with index label 2 with value 5\n",
    "s.replace(2, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    4.0\n",
       "1    3.0\n",
       "2    2.0\n",
       "3    1.0\n",
       "4    2.0\n",
       "5    0.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace all items with new values\n",
    "s.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     10.0\n",
       "1    100.0\n",
       "2      2.0\n",
       "3      3.0\n",
       "4      2.0\n",
       "5      4.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace using entries in a dictionary\n",
    "s.replace({0: 10, 1: 100})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a  b\n",
       "0  0  5\n",
       "1  1  6\n",
       "2  2  7\n",
       "3  3  8\n",
       "4  4  9"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# DataFrame with two columns\n",
    "df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "     a    b\n",
       "0    0    5\n",
       "1  100    6\n",
       "2    2    7\n",
       "3    3  100\n",
       "4    4    9"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# specify different replacement values for each column\n",
    "df.replace({'a': 1, 'b': 8}, 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10.0\n",
       "1     1.0\n",
       "2     2.0\n",
       "3     3.0\n",
       "4     2.0\n",
       "5     4.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate replacement with pad method\n",
    "# set first item to 10, to have a distinct replacement value\n",
    "s[0] = 10\n",
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10.0\n",
       "1    10.0\n",
       "2    10.0\n",
       "3    10.0\n",
       "4    10.0\n",
       "5     4.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace items with index label 1, 2, 3, using fill from the\n",
    "# most recent value prior to the specified labels (10)\n",
    "s.replace([1, 2, 3], method='pad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Applying functions to transform data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0\n",
       "1    2\n",
       "2    4\n",
       "3    6\n",
       "4    8\n",
       "dtype: int64"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate applying a function to every item of a Series\n",
    "s = pd.Series(np.arange(0, 5))\n",
    "s.apply(lambda v: v * 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a   b   c\n",
       "0  0   1   2\n",
       "1  3   4   5\n",
       "2  6   7   8\n",
       "3  9  10  11"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate applying a sum on each column\n",
    "df = pd.DataFrame(np.arange(12).reshape(4, 3), \n",
    "                  columns=['a', 'b', 'c'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    18\n",
       "b    22\n",
       "c    26\n",
       "dtype: int64"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate cumulative sum of items in each column\n",
    "df.apply(lambda col: col.sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     3\n",
       "1    12\n",
       "2    21\n",
       "3    30\n",
       "dtype: int64"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate sum of items in each row\n",
    "df.apply(lambda row: row.sum(), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a   b   c  interim\n",
       "0  0   1   2        0\n",
       "1  3   4   5       12\n",
       "2  6   7   8       42\n",
       "3  9  10  11       90"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a new column 'interim' with a * b\n",
    "df['interim'] = df.apply(lambda r: r.a * r.b, axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   a   b   c  interim  result\n",
       "0  0   1   2        0       2\n",
       "1  3   4   5       12      17\n",
       "2  6   7   8       42      50\n",
       "3  9  10  11       90     101"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# and now a 'result' column with 'interim' + 'c'\n",
    "df['result'] = df.apply(lambda r: r.interim + r.c, axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    a   b   c  interim  result\n",
       "0   3   1   2        0       2\n",
       "1  12   4   5       12      17\n",
       "2  21   7   8       42      50\n",
       "3  30  10  11       90     101"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# replace column a with the sum of columns a, b and c\n",
    "df.a = df.a + df.b + df.c\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "    0   1     2   3   4\n",
       "0   0   1   2.0   3   4\n",
       "1   5   6   NaN   8   9\n",
       "2  10  11  12.0  13  14"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a 3x5 DataFrame\n",
    "# only second row has a NaN\n",
    "df = pd.DataFrame(np.arange(0, 15).reshape(3,5))\n",
    "df.loc[1, 2] = np.nan\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    10.0\n",
       "2    60.0\n",
       "dtype: float64"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# demonstrate applying a function to only rows having\n",
    "# a count of 0 NaN values\n",
    "df.dropna().apply(lambda x: x.sum(), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "       0      1      2      3      4\n",
       "0   0.00   1.00   2.00   3.00   4.00\n",
       "1   5.00   6.00    nan   8.00   9.00\n",
       "2  10.00  11.00  12.00  13.00  14.00"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use applymap to format all items of the DataFrame\n",
    "df.applymap(lambda x: '%.2f' % x)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
